data gathered from https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases
Here I am doing a few things:
I made a row for each status, including a new active status
I got rid of all columns except Country, Status, Value, Date
I added a new region called “Global”, this holds all the values for every country combined
options(stringsAsFactors=FALSE)
library(plyr)
library(dplyr)
library(reshape2)
library(ggplot2)
library(plotly)
library(highcharter)
#function for grabbing status from my filenames
getStatus<- function(x){
strsplit(x, "-")[[1]] %>%
last() %>%
gsub(pattern="\\.csv", replacement="")
}
#function created for adding an active status
createActive <- function(x){
dcast(Country.Region + Date ~ Status,
data=x, value.var="Value") %>%
mutate(Active = Confirmed - (Deaths + Recovered)) %>%
melt(id = c("Country.Region", "Date"))
}
#function used to convert a dataset from long to wide
convert <- function(x){ dcast(Country + Date ~ Status,
data=x, value.var="Value")}
###########################################
### This is where I start cleaning the data
files <- list.files("raw", full.names = TRUE)
raw <- files %>% #read in files
lapply(function(x){
read.csv(x) %>%
mutate(
Date = as.Date(Date, "%m/%d/%Y"),
Status = getStatus(x) #add status
)
}) %>%
bind_rows() %>% #combine files
subset(!(Value==0) )
raw <- raw %>% #combine countries into one
group_by(Country.Region, Date, Status) %>%
summarise(
Value = sum(Value)
)
raw <- raw %>% #get global stats
group_by(Date, Status) %>%
summarise(
Value = sum(Value)
) %>%
ungroup() %>%
mutate(
Country.Region = "Global"
) %>%
bind_rows(raw)
raw <- raw %>% #create active columns, delete nulls, rename
createActive() %>%
subset(!is.na(value)) %>%
rename(
Country = Country.Region,
Value = value,
Status = variable
)
total <- raw %>% #Get total values for seperate df
group_by(Country, Status) %>%
summarise(
Value = max(Value)
) %>%
ungroup() %>%
mutate(
Status = as.character(Status)
)
#get change per day
raw <- raw %>%
group_by(Country, Status) %>%
mutate(
Change = Value - lag(Value, k=1),
Rate_Change = (Value - lag(Value, k=1))/lag(Value, k=1)
) %>%
ungroup() %>%
mutate(
Status = as.character(Status)
)Now that the data is clean, lets start digging into the data!
Lets start with looking at the overall feel of our top 5 Countries.
plot_data <- total %>%
subset(Country %in% c("US", "China", "Italy", "Germany", "Spain")) %>%
mutate(
Cases = Value
)
p <- plot_data %>%
ggplot(aes(reorder(Status,Cases), Cases, fill = Country),
text = paste("Cases:", Cases))+
geom_bar(stat="identity")+ coord_flip()+ xlab("Status of Case")+ ylab("Total Cases")+ ggtitle("Number of Cases for top 5 countries")
p <- ggplotly(p)
pThe above graph shows the differences of cumulative cases by type, and by country. We can see some pretty interesting things here. For example, Italy has the highest deaths and active cases, yet they are only second in the number of confirmed cases. This could mean a few things:
Italy may not have been testing enough people, which would start to bring their Mortality Rate down.
Italys’ healthcare system is getting overrun, and their deaths are about to skyrocket.
total %>%
subset(Country %in% c("US", "China", "Italy", "Germany", "Spain")) %>%
subset(Status == "Deaths") %>%
hchart("treemap",
colorByPoint = TRUE,
hcaes(x = Country, value = Value)) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Total Deaths by Country",
align = "left")This graph perfectly shows just how many Deaths Italy has compared to our other four countries. At the time of writing this, Italy has just about doubled their death count over Chinas’!
For the remainder of this analysis, I will be focusing on Italy.
Theres a couple things to keep in mind when looking at the rate of change. For example,
It has more variance in the beginning.
As the number of cases get larger, the rate will tend to even out, or even decline.
This is the basic Exponential growth function:
The rate is what gets exponentiated for the function, ie, if our rate is higher, our confirmed cases will get much higher with each day. Also, if the rate is lower that means our cases per day has dropped from the previous days’ number. If there starts to be a trend of consecutively lower rates of change, this means that the disease is starting to slow down, which be indicitive of being past the peak.
plot_data <- raw %>%
subset(Country == "Italy") %>%
subset(!(is.na(Rate_Change))) %>%
subset(Rate_Change <= 1)
p_c = plot_data %>%
subset(Status == "Confirmed")
p_d = plot_data %>%
subset(Status == "Deaths")
p_a = plot_data %>%
subset(Status == "Active")
hchart(density(p_a$Rate_Change), type = "area", color = "yellow", name = "Rate of Active") %>%
hc_add_series(density(p_c$Rate_Change), type = "area", name = "Rate of Confirmed", color = "green") %>%
hc_add_series(density(p_d$Rate_Change), type = "area", name = "Rate of Deaths", color = "red") %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Distribion of the Rate of Change, per Day",
align = "left") %>%
hc_subtitle(text = "For Italy",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}")) %>%
hc_xAxis(labels = list(format = "{value}"),
title = list(text = "Rate of Change"))%>%
hc_legend(align = "left") %>%
hc_tooltip(formatter = JS("function(){
return ('Rate of Change: ' + this.y)}"))This graph plainly shows that the Deaths rate of change has not only a higher average, but most of its’ rates are higher than the rate of confirmed. This tells us the number of deaths are going up much faster than our confirmed cases are.
There are a multitude of reasons why this could happen:
The Confirmed cases are lagging behind, due to the incubation period.
The Confirmed cases are innacurate, as finding this true value is difficult.
Our Mortality rate is not static,
The Rates of change will tend to even out over time, while will really be shown in the Changes of the Rates themselves.
This will be important for how we build the Monte Carlo Algorithm.
raw %>%
subset(Country == "Italy") %>%
subset(!(is.na(Rate_Change))) %>%
subset(Status == "Deaths") %>%
hchart( type = "scatter",
hcaes(y = Rate_Change, x = Change),
color = "red",
regression = TRUE,
borderColor = '#EBBA95',
borderRadius = 10,
borderWidth = 2) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_add_dependency("plugins/highcharts-regression.js") %>%
hc_title(text = "Rate of Change vs Actual Change",
align = "left") %>%
hc_subtitle(text = "For Italys' Deaths",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"),
title = list(text = "Rate of change")) %>%
hc_xAxis(labels = list(format = "{value}"),
title = list(text = "Cases per Day")) %>%
hc_legend(align = "left") %>%
hc_tooltip(formatter = JS("function(){
return ('Date: ' + this.point.Date + ' <br> Rate of Change: ' + this.y + ' <br> Cases per Day: ' + this.x)}"))Looking at this scatter plot, as the number of Deceased cases rise, the Rate of Change trends downwards. This means that Italy has a negative correlation between the actual change in cases per day and the rate of change. This is a good thing, as it could mean a few things:
The infection rate is starting to slow down.
The rate of change is starting to average out to a lower number.
Lets find out how well correlated the two actually are, using pearsons r method.
Before we start, I will get the outliers out of the data, as some of the early rates as those are innacurate, for the reasons described before.
get_dr_c <- function(data){
#Calulating the change in our rates of change
data <- data %>%
mutate(
D_RC_C = Rate_Change - lag(Rate_Change,k=1)
)
return (data)
}
corr <- raw %>% #subsetting our data
subset(Country == "Italy") %>%
subset(!(is.na(Rate_Change))) %>%
subset(!(is.na(Change))) %>%
subset(Status == "Deaths", select = c(Value, Change, Rate_Change))
rc_sd = sd(corr$Rate_Change) #standard deviation for rate of change
rc_mean = mean(corr$Rate_Change) #mean for rate of change
cd_sd = sd(corr$Change) #standard deviation for cases per day
cd_mean = mean(corr$Change) #mean for cases per day
corr <- corr %>% #removing outliers
subset(rc_mean-rc_sd*3 <= Rate_Change | Rate_Change <= rc_mean+rc_sd*3) %>%
subset(cd_mean-cd_sd*3 <= Change | Change <= cd_mean+cd_sd*3) %>% subset(Rate_Change < 1) #we have a few outliers that need to go
std_corr <- corr %>%
mutate(
Deaths_RC = (Rate_Change - rc_mean)/rc_sd
) %>%
subset(Rate_Change <= 1) %>% #we have a few outliers that need to go
get_dr_c() %>%
subset(!is.na(D_RC_C))Now that that is done, lets check to see if our Rate of Change distribution is relatively normal.
Lets use a shapiro Wilk Test to check its normality. But first, I will explain what the test is:
*The null-hypothesis of this test is that the population is normally distributed. + Thus, if the p value is less than the chosen alpha level, - then the null hypothesis is rejected and there is evidence that the data tested are not normally distributed. -likewise, if the p value is greater than the alpha level, then the data is normally distributed!
In plain terms, this is the slope of the QQ plot, over the variance. As the slope of the QQ plot approaches the Variance, ie, the distribution reaches normality, W will approach 1.
Thus, a number close to 1 indicates a higher level of normality!
##
## Shapiro-Wilk normality test
##
## data: std_corr$Rate_Change
## W = 0.94503, p-value = 0.1483
Awesome! this tells us that the p value is 0.1483 which is greater than 0.05 (a 95% confidence interval)!
This is great news! As it means that the null hypothesis is rejected! Telling us two things:
The Change in Rates Distribution is not stastically different from a normal distribution.
The W value is 0.945 which is less than 1, but only by 0.05497,
In summary, this means that the Rate of Change, in the Deaths, can be used as a predictor for the Monte Carlo Simulator!
Lets take a look into Pearsons r, this will tell us how correlated the data is, ie, how well fit the mean line is compared to the actual data.
corr <- corr %>%
get_dr_c()
corr %>%
subset(!is.na(D_RC_C)) %>%
as.matrix() %>%
cor() %>% #get correlation
round(3) %>% #round
hchart() %>% #graph
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Pearsons r matchup",
align = "left") %>%
hc_subtitle(text = "For Italys' Deceased Cases",
align = "left") %>%
hc_xAxis(categories = list("Cumulative Deaths", "Cases per Day", "Rate of Change", "Change in Rate")) %>%
hc_yAxis(categories = list("Cumulative Deaths", "Cases per Day", "Rate of Change", "Change in Rate"))%>%
hc_legend(align = "left") %>%
hc_plotOptions(
series = list(
boderWidth = 0,
dataLabels = list(enabled = TRUE)))As expected there is a negative correlation between the Rate of Change and Total cases, aswell as a negative correlation between the Rate of Change and the cases per day.
This makes sense, and indicates that as the amount of Cumulative Deaths increase the Rate of Change will start to decrease aswell. This is great news for a few reasons:
This ultimately suggests that the rate of Deaths per day is slowing down
This suggests that the Rate of Change is starting to normalize, causing less drastic changes in the over changes in these Rates.
The opposite is true with the positive r values. For exampe, the r value between the Cumulative Deaths and the Cases per day is 0.938! This highly suggests that as the CUmulative Deaths start to increase, so will the Number of Cases per day.
That is about all the information we can get out of r, so lets take a loot at r squared!
corr %>%
subset(!is.na(D_RC_C)) %>%
as.matrix() %>%
cor() %>% #get correlation
'^'(2) %>% #square numbers
round(3) %>% #round
hchart() %>% #graph
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Pearsons r squared matchup",
align = "left") %>%
hc_subtitle(text = "For Italys' Deceased Cases",
align = "left") %>%
hc_xAxis(categories = list("Cumulative Deaths", "Cases per Day", "Rate of Change", "Change in Rate")) %>%
hc_yAxis(categories = list("Cumulative Deaths", "Cases per Day", "Rate of Change", "Change in Rate"))%>%
hc_legend(align = "left") %>%
hc_plotOptions(
series = list(
boderWidth = 0,
dataLabels = list(enabled = TRUE)))The r squared value can tell us much more, for example, there is a r^2 value of 0.88 between the Cumulative deaths and the Cases per day. This means 88.7% of the Cases per day can be explained by our Total cases, or vice versa! This goes on for all of our relatively high r^2 values, showing a strong correlation between a lot of the different calculations.
Let’s check if these values are statistically significant, to do that we need to do the below test.
Null Hypothesis:
Alternate Hypotheses:
This test requires the following formula:
df = nrow(corr)-2 #our degree of freedom
cv = qt(.975, df) #our critical value
r <- corr %>% #correlating our data for r values
subset(!is.na(D_RC_C)) %>%
rename(Deaths = Value) %>%
as.matrix() %>%
cor()
call_ts <- function(r, df){#function for getting the test statistic
ts= abs((r*(df)^.5)/(1-(r)^2)^.5) #formula
return (ts)
}
#changing our matrix to be the test statistics
ts <- call_ts(r, df)
ts## Deaths Change Rate_Change D_RC_C
## Deaths Inf 14.0777415 3.230726 0.01932788
## Change 14.07774150 Inf 2.035128 0.98035389
## Rate_Change 3.23072584 2.0351284 Inf 5.15919843
## D_RC_C 0.01932788 0.9803539 5.159198 Inf
These are the test statistics for each r value, now we need to use the below formula to check if the r value is statistically significant.
Here is a matrix of our r values.
## Deaths Change Rate_Change D_RC_C
## Deaths 1.000000000 0.9381351 -0.5280149 0.003719627
## Change 0.938135074 1.0000000 -0.3646870 0.185398322
## Rate_Change -0.528014886 -0.3646870 1.0000000 0.704578921
## D_RC_C 0.003719627 0.1853983 0.7045789 1.000000000
This tells us that our all our correlations are statistically significant! Which means that we can use all of our factors as a predictor of the other.
This is very important for building the algorithm to the Monte Carlo Simulator.
There are a few things I want to point out about Covid-19:
Using these facts, we can estimate the average time from point of infection to death is 23 days (5 incubation days + 18 days till death). This means that if someone dies today from Covid-19 we can estimate that they got infected 23 days prior. This will help us later on for assessing the total amount of Infected Cases.
These are the reasons it is best to use the Deaths as the base predictor for the Monte Carlo Simulator.
https://www.uptodate.com/contents/coronavirus-disease-2019-covid-19?source=history_widget
https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(20)30566-3/fulltext
///////////////////////////////// /// START REWORDING HERE /////////////////////////////////
Now that the correlations have been tested against eachother, the process for the algorithm can begin!
This algorithm will be broken down into the following steps:
Find the Cumulative Deaths.
Find the Deaths per day.
Find the Rate of change of the Deaths per day.
Find the Changes in those rates.
Predict the future change in rate,
Great, now once the values are predicted, the algorithm will need to math the Change in Rates back to the next Rate of Change, then the Deaths per day, then finally the predicted Cumulative Deaths!
We are calculating this one day at a time, which means our t value will be 1, giving us this:
In plain terms, the next Cumulative Death total is equal to the previous Cumulative Death total times, 1 plus our current rate.
We also know a few things:
Meaning, current Rate is equal to the change in the current rate plus the old rate.
Using the above we get; the change in the Cumulative Deaths (deaths per day) is equal to the previous Cumulative Deaths time the current Rate.
Thus, the current Cumulative Death is equal to the change in our Cumulative Death plus the previous Cumualtive Death total.
To start, there are a few things to note about how I built the algorithm for this analysis:
Lets dive in by taking a look at how our rates of change work!
Here I will go into a more in-depth analysis of the Change in Rates, in order to prep it for the algorithm!
To start, the Rate of Change can be formulated by:
In plain terms, this formula takes the change per day of our deaths, and divides that by our previous days Cumulative Deaths! Making the Deaths per day into a percentage of the Cumulative Deaths!
split_to_country <- function(data, country){
#take data and split by country
#first function for model
country_data <- data %>%
subset(Country == country) %>% #getting italy
convert() %>% #function converts to wide format
subset(select = c(Country, Date, Deaths, Confirmed)) %>%
mutate( #adding columns for per day change, and rate per day
Deaths_Per_Day = Deaths - lag(Deaths, k=1),
Deaths_Rate_Change = (Deaths - lag(Deaths, k=1))/lag(Deaths, k=1),
Mortality_Rate = Deaths/Confirmed
)
#cleaning data for further analysis
country_data <- country_data %>%
subset(select = c(Date, Deaths, Deaths_Per_Day, Deaths_Rate_Change)) %>%
subset(!is.na(Deaths_Per_Day)) %>%
mutate(
Deaths_PD = Deaths_Per_Day,
Deaths_RC = Deaths_Rate_Change
) %>%
subset(select = c(Date, Deaths, Deaths_PD, Deaths_RC))
return (country_data)
}
italy <- raw %>%
split_to_country("Italy")
#showing tail values
tail(italy,5)## Date Deaths Deaths_PD Deaths_RC
## 27 2020-03-19 3405 427 0.1433848
## 28 2020-03-20 4032 627 0.1841410
## 29 2020-03-21 4825 793 0.1966766
## 30 2020-03-22 5476 651 0.1349223
## 31 2020-03-23 6077 601 0.1097516
Now that this step is complete, the change these rates can be calculated. Essentially looking at how much our rates are fluctiating over time. This can be done simply by taking our first deaths rate of change and subtracting the second days rate of change from that. This will mean if the rate goes up, then the change in that rate will go up as well!
find_dr_c <- function(data){
#Calulating the change in our rates of change
data <- data %>%
mutate(
D_RC_C = Deaths_RC - lag(Deaths_RC,k=1)
)
return (data)
}## SLIGHTLY DIFFERENT FROM BEFORE
italy <- italy %>%
find_dr_c()
tail(italy,5)## Date Deaths Deaths_PD Deaths_RC D_RC_C
## 27 2020-03-19 3405 427 0.1433848 -0.04638745
## 28 2020-03-20 4032 627 0.1841410 0.04075615
## 29 2020-03-21 4825 793 0.1966766 0.01253562
## 30 2020-03-22 5476 651 0.1349223 -0.06175431
## 31 2020-03-23 6077 601 0.1097516 -0.02517064
Great, here are Italys’ cumulative Deaths, Deaths per day, Deaths rate of change, and the change in the rate of change. All of this information will be used to work backwards at the end, to fill in the predictions.
First, lets take a look at the distribution of last 8 days’ Change in Rate.
##save og data!
og <- italy
split_for_graph <- function(data, split){
data <- data %>%
subset(!is.na(D_RC_C))
#taking the bottom 8 samples
data <- data[ceiling(nrow(data)-(split-1)) : nrow(data),]
mu = data %>%
subset(select = Deaths_PD) %>%
as.matrix() %>%
mean()
sd = data %>%
subset(select = Deaths_PD) %>%
as.matrix() %>%
sd()
#getting rid of outliers in the deaths per day
data <- data %>%
subset(mu-sd*3 < Deaths_PD & Deaths_PD < mu+sd*3)
mu <- data %>%
subset(select = D_RC_C) %>%
as.matrix() %>%
mean()
sd <- data %>%
subset(select = D_RC_C) %>%
as.matrix() %>%
sd()
#getting rid of outliers in the change of the rate of change in deaths
data <- data %>%
subset(mu-sd*3 < D_RC_C & D_RC_C < mu+sd*3)
return (data)
}
italy <- italy %>%
split_for_graph(8)hchart(density(italy$D_RC_C), type = "area", color = "red", name = "Rate of Deaths") %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Distribion of the Change in Rate",
align = "left") %>%
hc_subtitle(text = "For Italys' Deceased Cases",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}")) %>%
hc_xAxis(labels = list(format = "{value}"),
title = list(text = "Change in the Rate of Change of Deaths"))%>%
hc_legend(align = "left") %>%
hc_tooltip(formatter = JS("function(){
return ('Rate of Change: ' + this.y)}"))This is the distribution of the rate of change, from a visual standpoint, there seems to be a slight dip at 0, suggesting that the changes skip over 0. This is a good thing, as the Change in Rate should always be changing, as long as the Cumulative Deaths are changing.
Lets do another Shapiro Wilk test on this, to check its normality.
##
## Shapiro-Wilk normality test
##
## data: italy$D_RC_C
## W = 0.89634, p-value = 0.2677
Exactly what we wanted to see, our p value is 0.2677 > 0.05, meaning our distribution is normal enough to use as a predictor!
Now that the Change in Rate has been calculated and tested, the prediction model can start to be built!
This process will be walked over in the code chunks, as I am building the process by hand. The overall process of the algorithm was mentioned at the start of this section.
######################
### functions used inside prediction model function
######################
# FUNCTION for getting the rate of change
get_rc <- function(death_rc_n1, change){
#rc = lag death_rc + change
rc=death_rc_n1+change
return(rc)
}
# FUNCTION for getting the Deaths per day
get_pd <- function(deaths_n1, deaths_rc){
#lag death*roc = pd
pd = (deaths_rc*deaths_n1)
if (pd <0){ #the per day cannot be negative
pd = pd * -1
}
return(ceiling(pd))
}
# FUNCTION for getting the Cumulative Deaths
get_death <- function(lag_death = lag_death, dpd){
#lag death + death per dat = next cumulative death
if(dpd <0){ #the per day cannot be negative
dpd <- dpd * -1
}
death = lag_death+dpd
return(ceiling(death))
}
### FUNCTION used for prediction model
prediction_model <- function(data, days, trials, split, curr_d){
#getting the current day inside the data passed
curr_d = max(data[data$D_RC_C != 0,"Date"])
for (n in 1:trials){ #loop for num of simulations
temp <- data %>% #create a ned df for each sim
subset(select = c(Date, D_RC_C))
for (i in 1:days){ #loop for num of pred days
mu <- temp %>% #grabbing mean
subset(select = D_RC_C) %>%
slice(nrow(temp)-(nrow(temp)-split):nrow(temp)+i) %>%
as.matrix() %>%
mean()
sd <- temp %>% #grabbing standard deviation
subset(select = D_RC_C) %>%
slice(nrow(temp)-(nrow(temp)-split):nrow(temp)+i) %>%
as.matrix() %>%
sd()
set.seed(i*13*n) #setting a changing seed
temp <- temp %>% #adding a new row to the df
bind_rows(data.frame(Date = as.Date(curr_d, format = "%Y-%m-%d")+i, D_RC_C = rnorm(1, mu, sd))) %>%
mutate(
Trial = n #adding the trial number
)
}
if (n == 1){ #to create the new trial data
trial_data <- temp
}
else{ #to combine the trial data
trial_data <- temp %>%
bind_rows(trial_data)
}
}
return (trial_data) #FINISHED
}
#Grabbing the current day for usage
curr_d <- max(italy[italy$D_RC_C != 0,"Date"])
TRIALS <- 100 #set number of trials
DAYS <- 10 #set number of days
SPLIT <- 8 #set number of window
italy <- italy %>%
prediction_model(days = DAYS, trials = TRIALS, split = SPLIT, curr_d) %>%
group_by(Trial, Date) %>%
slice(n()) %>%
ungroup() %>%
mutate(
Deaths = NA,
Deaths_PD = NA,
Deaths_RC = NA,
Country = "Italy"
)hc<- italy %>%
subset(select= c(D_RC_C, Date, Trial, Deaths)) %>%
subset(Trial == 1) %>%
mutate(
name = "Simulation"
) %>%
subset(Date >= curr_d) %>%
hchart(type = "line", hcaes(x=Date, y=D_RC_C), name = "Simulated", color = "black")
p_og <- og %>%
mutate(
name = "Actual"
)
hc <- hc %>%
hc_add_series(data = p_og, type = "line", color = "red", hcaes(x=Date, y=D_RC_C), name= "Actual") %>%
hc_xAxis(type = "datetime", dateTimeLabelFormats = list(day = '%d/%b')) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Projected Change in Rates",
align = "left") %>%
hc_subtitle(text = "#Simulations = 1, #days = 10, using last 8 days mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Change in Rate of Deaths")) %>%
hc_xAxis(title = list(text = "Date"),
plotBands = list(
list(
label = list(text = "This is the Predicted Data"),
color = "lightblue",
from = datetime_to_timestamp(as.Date(curr_d)+1),
to = datetime_to_timestamp(as.Date('2020-06-02'))
)))%>%
hc_legend(align = "left") %>%
hc_tooltip(formatter = JS("function(){
return ('Date: ' + this.point.Date + ' <br> Change in Rate: ' + this.y +
' <br> Deaths: ' + this.point.Deaths + ' <br> Data: ' + this.point.name)}"))
hcThis graph shows just one Trial of the Change in Rates. For this test model, I ran a total of 20 simulations, so keep in mind, each individual simulation will look different.
As shown, in the beginning of the data, where the Cumulative Deaths were still very low, the variation of those changes were very drastic, over time the Changes started to slow down, so this algorithm is looking at the last 8 days of the actual data, and keeping the revolving door moving, it should start to even out over time, converging to 0. When the Changes in the Rates converge to 0, this means that the Deaths per day are staying stagnant, presumably this will happen when the disease has done a few things:
Now that all the Changes in Rates have been predicted, lets’ run the algorithm to calculate backwards to fill in the blanks!
# FUNCTION used to calculate the rate of change, deaths per day, and cumulative deaths
calc_deaths <- function(data, trials, curr_d){
for (i in 1:trials){ #run for num of trials
for (d in data$Date){ #run for num of days
if (d > as.Date(curr_d, format = "%Y-%m-%d")){
#getting roc
data[data$Date == d & data$Trial == i,6] <- get_rc(data[data$Date == d-1 & data$Trial == i,6], data[data$Date == d & data$Trial == i,2])
#getting deaths per day
data[data$Date == d & data$Trial == i,5] <- get_pd(data[data$Date == d-1 & data$Trial == i,4], data[data$Date == d & data$Trial == i,6])
#getting cumulative deaths
data[data$Date == d & data$Trial == i,4] <- get_death(data[data$Date == d-1 & data$Trial == i,4], data[data$Date == d & data$Trial == i,5])
}
}
}
return (data) #FINISHED
}
italy <- italy %>%
subset(Date >= as.Date(curr_d, format = "%Y-%m-%d"))
for (d in og$Date){ #fill in the known values
italy[italy$Date == d,"Deaths"] <- og[og$Date == d,"Deaths"]
italy[italy$Date == d,"Deaths_PD"] <- og[og$Date == d,"Deaths_PD"]
italy[italy$Date == d,"Deaths_RC"] <- og[og$Date == d,"Deaths_RC"]
}
italy <- italy %>% #function call
calc_deaths(TRIALS, curr_d)Awesome, now all the blanks are filled in marking the end of the Algorithms’ job! Lets start with a quick Analaysis of these results!
It is important to note a few things about Monte Carlo Algorithms:
hc <- highchart()
pn <- italy %>%
subset(Date >= curr_d)
for (i in 1:TRIALS){
hc <- hc %>%
hc_add_series(name = paste("SImulation", as.character(i), sep=" "), data = pn[pn$Trial == i,c("Deaths", "Date", "Trial")], type = "line", hcaes(x=Date, y=Deaths), showInLegend = F)
}
hc <- hc %>%
hc_xAxis(type = "datetime", dateTimeLabelFormats = list(day = '%d/%b')) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Italys' Simulated Cumulative Deaths",
align = "left") %>%
hc_subtitle(text = "#Simulations = 100, #days = 10, using last 8 days mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Cumulative Number of Deaths")) %>%
hc_xAxis(title = list(text = "Date"),
plotBands = list(
list(
label = list(text = "This is the Simulated Data"),
color = "lightblue",
from = datetime_to_timestamp(as.Date(curr_d)+1),
to = datetime_to_timestamp(as.Date('2020-06-02'))
)))%>%
hc_legend(align = "left") %>%
hc_add_series(name = "Actual Data", data = og, type = "line", hcaes(x=Date, y=Deaths), color = "black") %>%
hc_tooltip(formatter = JS("function(){
return ('Date: ' + this.point.Date + ' <br> Number of Deaths: ' + this.y + ' <br> Simulation: ' + this.point.Trial)}"))
hcRemember: You can deselect the actual data to get a closer look at the simulation runs.
As shown, these are the 100 simulations for a 10 day forecast of Italys’ Cumulative Deaths. There are a few things to note from this graph:
There seems to be a small clustering of simulations that heaviny exponentiated.
The majority of the simulations appears to be concentrated at a slower exponential rate.
Lets take a deeper look into these numbers.
hc <- italy %>%
subset(Date > curr_d) %>%
hchart(type = "scatter", hcaes(x=Date,y=Deaths, color = Trial),
name = "Simulation") %>%
hc_tooltip(formatter = JS("function(){
return ('Date: ' + this.point.Date + ' <br> Number of Deaths: ' + this.y + ' <br> Simulation: ' + this.point.Trial)}"))
hc <- hc %>%
hc_xAxis(type = "datetime", dateTimeLabelFormats = list(day = '%d/%b')) %>%
hc_add_series(name = "Actual Data", data = og[og$Date==curr_d,], type = "scatter", hcaes(x=Date, y=Deaths), color = "black") %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Italys' Simulated Cumulative Deaths",
align = "left") %>%
hc_subtitle(text = "#Simulations = 100, #days = 10, using last 8 days mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Cumulative Number of Deaths")) %>%
hc_xAxis(title = list(text = "Date"),
plotBands = list(
list(
label = list(text = "This is the Simulated Data"),
color = "lightblue",
from = datetime_to_timestamp(as.Date(curr_d)+1),
to = datetime_to_timestamp(as.Date('2020-06-02'))
)))%>%
hc_legend(align = "left")
hcHere we can see the clustering of our Simulations. It seems the farther the forecast, generally the less clustered it becomes. This means that the Variance is growing as we predict the days, something that is expected as this is using a Markov Technique.
A few notes about this graph:
Lets take a look at the distribution of these indivdual days!
tplotd <- italy %>%
subset(Date == curr_d + 1)
hc <- hchart(density(tplotd$Deaths), type = "area", name = paste("Prediction Day #", as.character(1), sep=""))
for (i in 2:DAYS){
tplotd <- italy %>%
subset(Date == curr_d + i)
hc <- hc %>%
hc_add_series(density(tplotd$Deaths), type = "area", name = paste("Simulated Day #", as.character(i), sep=" ")) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Italys' Distribution of Cumulative Deaths",
align = "left") %>%
hc_subtitle(text = "#Simulations = 100, #days = 10, using last 8 days mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Dristribution")) %>%
hc_xAxis(title = list(text = "Cumulative Deaths")) %>%
hc_legend(align = "left") %>%
hc_tooltip(formatter = JS("function(i,curr_d){
return ('Number of Deaths: ' + this.x)}"))
}
hcRemember: You can deselect the simulation days to get a better look at individual days, or to compare a few days.
This shows what exactly is going on with each day that the algorithm predicts.
Now lets take a look the individual rates!
hc <- italy %>%
subset(Date >= curr_d) %>%
hchart(type = "scatter", hcaes(x=Date,y=Deaths_RC, color = Trial),
name = "Simulation")
val <- og %>%
subset(Date == curr_d, select = Deaths_RC) %>%
as.numeric()
hc <- hc %>%
hc_xAxis(type = "datetime", dateTimeLabelFormats = list(day = '%d/%b')) %>%
hc_add_series(name = "Actual Data", data = og, type = "line", hcaes(x=Date, y=Deaths_RC), color = "black") %>%
hc_yAxis(title = list(text = "Rate of Change in Deaths"),
plotLines = list(list(
value = val,
color = '#ff0000',
width = 3,
zIndex = 4,
label = list(text = "Last Known Rate",
style = list( color = '#ff0000', fontWeight = 'bold' ))))) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Italys' Rate of Change in Cumulative Deaths",
align = "left") %>%
hc_subtitle(text = "#Simulations = 100, #days = 10, using last 8 days mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Rate of Change of Cumulative Deaths")) %>%
hc_xAxis(title = list(text = "Date"),
plotBands = list(
list(
label = list(text = "This is the Predicted Data"),
color = "lightblue",
from = datetime_to_timestamp(as.Date(curr_d)+1),
to = datetime_to_timestamp(as.Date('2020-06-02'))
)))%>%
hc_legend(align = "left") %>%
hc_tooltip(formatter = JS("function(){
return ('Date: ' + this.point.Date + ' <br> Rate of Change: ' + this.y +
' <br> Deaths per day: ' + this.point.Deaths_PD +
' <br> Simulation: ' + this.point.Trial)}"))
hcNow it is obvious that there is a clear split between rates that are trending downwards and rates that are climbing upwards! This is great news, as it can generally implying that the Rate of Change will start to climb down. This can suggest a few things:
One thing that is possible with this algorithm, is changing the hyperparameters.
I am going to import a dataset of this algorithm, that was ran with more trials and different hyperparameters. I want to specifically limit the range to a 6 day forecast, as we know the Algorithm gets more inacurate with each consecutive day.
files <- list.files("predictions", full.names = TRUE)
pred_data <- files %>% #read in files
lapply(function(x){
read.csv(x) %>%
mutate(
Date = as.Date(Date, "%Y-%m-%d")
)
}) %>%
bind_rows()
plot_data <- pred_data %>%
subset(Split == c(4, 6, 8, 10, 30)) %>%
subset(Date > curr_d) %>%
mutate(
group = paste("Window:",as.character(Split),sep=" ")
)
hc <- hcboxplot(x = plot_data$Deaths_RC, var = plot_data$Date, var2 = plot_data$group,
outliers = FALSE) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Italys' Simulated Deaths Rate of Change",
align = "left") %>%
hc_subtitle(text = "#Simulations = 100, #days = 6, legend = #days for mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Rate of Change in Deaths, per day")) %>%
hc_xAxis(title = list(text = "Date"), label = list(day = '%d/%b'))%>%
hc_legend(align = "left")Remember: You can deselect the the Windows in the legend, to get a closer comparison between each hyperparameter.
This shows that a revolving mean and standard deviation can have completely different effects on the model, depending on what parameters are chosen.
This chart shows variation of the data, that appears to explode when anything over 10 is chosen. * This is most likely due to the fact that the rates were vastly different in the beginning stages of the spread, as the overall numbers are lower. + Which suggests that we should be looking at a window that is less than 10, for better results.
There is one thing I want to point out, when observing the window of 30 days, the median Rate of Change is the lowest one out of all other options, for all except the first day. This suggests, that although the variance between the trials is higher, the median is still converging to a Rate of Change.
Lets take a look deeper look at the simulations with windows that are less than 10 days.
Remember: You can deselect the the Windows in the legend, to get a closer comparison between each hyperparameter.
It seems that nearly every simulation showed a median change that is negative. This would highly indicate that the Change in Deaths per day is slowing down, meaning that the Cumulative Deaths is starting to slow down!
lets take a look at the mean Deaths for the Trials, and compare that with the actual data that is now available for italy!
pred_mean <- pred_data %>%
subset(Date > curr_d) %>%
group_by(Split, Date) %>%
summarise(
mean_D = ceiling(mean(Deaths)),
mean_D_RC = mean(Deaths_RC),
mean_D_RC_C = mean(D_RC_C),
sd_D = sd(Deaths),
sd_D_RC = sd(Deaths_RC),
sd_D_RC_C = sd(D_RC_C)
)
files <- list.files("future_data", full.names = TRUE)
new_og <- files %>% #read in files
lapply(function(x){
read.csv(x) %>%
mutate(
Date = as.Date(Date, "%Y-%m-%d")
)
}) %>%
bind_rows()
plot_d <- pred_mean %>%
subset(Split < 10) %>%
mutate(
group = paste("Window:",as.character(Split),sep=" ")
)
plot_d <- new_og %>%
rename(
mean_D = Deaths,
mean_D_RC = Deaths_RC,
mean_D_RC_C = D_RC_C
) %>%
mutate(
group = "Actual Data"
) %>%
bind_rows(plot_d)
hc <- hchart(plot_d, "column", hcaes(x = Date, y = mean_D, group = group), color =c("red", "lightblue", "lightblue", "lightblue", "lightblue", "lightblue", "lightblue", "lightblue")) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Comparison of Simulations Mean Cumulative Deaths",
align = "left") %>%
hc_subtitle(text = "#Simulations = 100, #days = 6, legend = #days for mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Mean Cumulative Deaths")) %>%
hc_xAxis(title = list(text = "Date"), label = list(day = '%d/%b'))%>%
hc_legend(align = "left")
hcRemember: You can deselect the the Windows in the legend, to get a closer comparison between each hyperparameter.
This chart shows the actual Cumulative deaths for March 24th - 29th, as well as the simulated values. This tells us a few things about the simulations: